library(timetk)
library(inspectdf)
library(janitor)
library(readr)
library(dplyr)
library(readr)
library(ggplot2)
library(naniar)
library(packcircles)
library(ggridges)
library(ggbeeswarm)
library(patchwork)
library(dplyr)
library(tidyr)
library(readr)
library(skimr)
library(purrr)
library(stringr)
library(urltools)
library(magrittr)
library(lubridate)
library(janitor)
library(tseries)
library(performance)
Az adatsor Tetouan City 2017-es áramfogyasztásának alakulását írja le 10 percenkénti bontásban 3 zónára. Ebben a notebookban előkészítem az adatot elemzésre, aztán elemzem és végül egy idősoros előrejelzést fogok csinálni.
Salam, A., & El Hibaoui, A. (2018, December). Comparison of Machine Learning Algorithms for the Power Consumption Prediction:-Case Study of Tetouan city–. In 2018 6th International Renewable and Sustainable Energy Conference (IRSEC) (pp. 1-5). IEEE
az áramfogyasztás előrejelzése idősor elemzéssel
Használjuk a linket githubról, hogy ne kelljen a lokális fájllal dolgozni.
url <-
"https://raw.githubusercontent.com/fzksmrk/adatbanyaszati-beadando-2/main/data.csv"
data <- readr::read_csv(
url,
col_types = cols(
DateTime = col_datetime(format = "%m/%d/%Y %H:%M"),
Temperature = col_double(),
Humidity = col_double(),
`Wind Speed` = col_double(),
`general diffuse flows` = col_double(),
`diffuse flows` = col_double(),
`Zone 1 Power Consumption` = col_double(),
`Zone 2 Power Consumption` = col_double(),
`Zone 3 Power Consumption` = col_double()
)
)
data
A janitor csomag használatával nevezzük át az oszlopokat, hogy azok “snake” formátumúak legyenek.
Szintén egyszerűsítsük (rövidítsük) az áram felhasználására vonatkozó oszlop nevét.
data <- janitor::clean_names(data)
data <- dplyr::rename(data,"zone_1" = "zone_1_power_consumption","zone_2" = "zone_2_power_consumption","zone_3" = "zone_3_power_consumption")
data
Adjuk össze a 3 zóna áramfelhasználását, hogy megkapjuk a város teljes áramfelhasználását.
data <- data %>%
mutate(data, zone_total = zone_1 + zone_2 + zone_3)
Idősoros elemzésnél fontos lehet, hogy a dátum további dimenzióit is megvizsgáljuk (például a hét napjait), ezért a timetk csomaggal kibővítjük az idősor dimenzióit. Ami nem fontos most, azokat eltávolítom.
data <- data %>%
timetk::tk_augment_timeseries_signature(date_time) %>%
select(
-matches(
"(half)|(mday)|(qday)|(mday)|(yday)|(mweek)|(xts)|(second)|(minute)|(iso)|(num)|(hour12)|(am.pm)|(week\\d)|(mday7)"
)
) %>%
select(-diff,-wday) %>%
mutate(hour = factor(hour, ordered = TRUE))
inspectdf::inspect_num(data)
data %>%
ggplot(aes(date_time, temperature)) +
geom_line() +
labs(
x = NULL,
y = "Temperature"
) -> p1
data %>%
ggplot(aes(date_time, humidity)) +
geom_line() +
labs(
x = NULL,
y = "Humidity"
) -> p2
data %>%
ggplot(aes(date_time, wind_speed)) +
geom_line() +
labs(
x = NULL,
y = "Wind Speed"
) -> p3
data %>%
select(date_time, contains("flows")) %>%
pivot_longer(-date_time) %>%
ggplot(aes(date_time, value)) +
geom_line(aes(color = name)) +
labs(
x = NULL,
y = "Flows",
color = "Flows"
) -> p4
p1 / p2 / p3 / p4
Jelentős változást láthatunk a hőmérséklet változásában augusztus környékén.
data %>%
select(date_time, contains("zone")) %>%
pivot_longer(-date_time) %>%
ggplot(aes(date_time, value)) +
geom_line(aes(color = name)) +
scale_y_continuous(labels = scales::label_number_si()) +
labs(
x = NULL,
y = "Power",
color = "Zone"
)
A zone 3-ban kiugrást láthatunk augusztus környékén, amit a kimagasló hőmérsékletnek tudnék be.
ggplot(data, aes(x = temperature, y = zone_3)) +
geom_point(aes(color = month.lbl), alpha = 0.5) +
labs(title = "Temperature vs Zone 3 Power Consumption",
x = "Temperature",
y = "Zone 3 Power Consumption"
)
Láthatjuk, hogy a nyári hónapok tényleg magasabb hőmérséklettel és magasabb energie felhasználással is jártak
és láthatjuk, hogy a zone 1-ben van egy “váratlan” csökkenés június környékén.
data %>%
filter(date_time > "2017-06-20", date_time < "2017-07-02") %>%
select(date_time, zone_1) %>%
pivot_longer(-date_time) %>%
ggplot(aes(date_time, value)) +
geom_vline(xintercept = as.POSIXct("2017-06-26"), color = "black", lty = 2) +
geom_line(aes(color = name)) +
scale_y_continuous(labels = scales::label_number_si()) +
labs(
x = NULL,
y = "Power",
color = "Zone"
)
Vajon ez miért történhetett - nézzük meg a hőmérséklet változást
data %>%
filter(date_time > "2017-06-20", date_time < "2017-07-02") %>%
select(date_time, temperature) %>%
pivot_longer(-date_time) %>%
ggplot(aes(date_time, value)) +
geom_vline(xintercept = as.POSIXct("2017-06-26"), color = "black", lty = 2) +
geom_line(aes(color = name)) +
scale_y_continuous(labels = scales::label_number_si())
Érdekes, hogy a hőmérsékletben 2 nappal korábban láthatunk változást, lehet érdemes lenne megvizsgálni, hogy van-e összefüggés. (máskor)
És nézzük a trendeket:
data %>%
select(hour, zone_1,zone_2,zone_3) %>%
pivot_longer(-hour) %>%
ggplot(aes(hour, value)) +
geom_boxplot(aes(color = name),
outlier.alpha = 0.1) -> p1
data %>%
select(wday.lbl, zone_1,zone_2,zone_3) %>%
pivot_longer(-wday.lbl) %>%
ggplot(aes(wday.lbl, value)) +
geom_boxplot(aes(color = name),
outlier.alpha = 0.1) -> p2
data %>%
select(month.lbl, zone_1,zone_2,zone_3) %>%
pivot_longer(-month.lbl) %>%
ggplot(aes(month.lbl, value)) +
geom_boxplot(aes(color = name),
outlier.alpha = 0.1) -> p3
p1/ p2 / p3
decompose_ts <- function(series, freq = 144) {
ts_obj <- ts(series, frequency = freq)
decompose(ts_obj)
}
plot(decompose_ts(data$zone_1))
A dekompozíció 3 komponensre bontja az idősorunkat: trend / seasonal / random
Talán a randomban látható nyár közeli kitérést emelném ki, mint érdekesség.
plot(decompose_ts(data$zone_total))
plot(decompose_ts(data$temperature))
az áramfogyasztás előrejelzése idősor elemzéssel
Hozzuk létre az idősorunkat
zone_1_ts <- ts(data$zone_1, frequency = 144)
Elemezzük, hogy az adatunk stacionárius-e
adf_test <- adf.test(zone_1_ts)
## Warning in adf.test(zone_1_ts): p-value smaller than printed p-value
print(adf_test$p.value)
## [1] 0.01
Mivel a p érték kevesebb, mint 0.05, ezért a nullhipotézist elvethetjük és elfogadhatjuk, hogy az adatunk stacionárius
acf(zone_1_ts)
pacf(zone_1_ts)
library(forecast)
fit <- auto.arima(zone_1_ts, seasonal = TRUE)
summary(fit)
## Series: zone_1_ts
## ARIMA(3,0,0)(0,1,0)[144]
##
## Coefficients:
## ar1 ar2 ar3
## 1.0507 -0.1566 0.0816
## s.e. 0.0044 0.0063 0.0044
##
## sigma^2 = 187691: log likelihood = -391528.2
## AIC=783064.5 AICc=783064.5 BIC=783100
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.162183 432.6257 264.0267 -0.01083862 0.8469455 0.2029934
## ACF1
## Training set -0.0007429365
forecast_values <- forecast(fit, h = 77*2*5)
lasts <- tail(zone_1_ts, 77*2*10)
# Create a data frame for the historical data
history_df <- data.frame(Time = time(lasts), Value = as.numeric(lasts), Lo95 = NA, Hi95 = NA, Type = "Historical")
# Create a data frame for the forecasted data
forecast_df <- data.frame(Time = time(forecast_values$mean),
Value = as.numeric(forecast_values$mean),
Lo95 = as.numeric(forecast_values$lower[,2]),
Hi95 = as.numeric(forecast_values$upper[,2]),
Type = "Forecast")
# Ensure that 'Time' columns are of the same class
history_df$Time <- as.numeric(history_df$Time)
forecast_df$Time <- as.numeric(forecast_df$Time)
# Combine the historical and forecasted data into a single data frame
combined_df <- rbind(history_df, forecast_df)
# Create a ggplot of the historical and forecasted values
ggplot() +
geom_line(data = combined_df, aes(x = Time, y = Value, color = Type)) +
geom_line(data = forecast_df, aes(x = Time, y = Lo95), linetype = "dotted", color = "blue") +
geom_line(data = forecast_df, aes(x = Time, y = Hi95), linetype = "dotted", color = "blue") +
scale_color_manual(values = c("Historical" = "black", "Forecast" = "red")) +
labs(title = "ARIMA Forecast", x = "Time", y = "Value") +
theme_minimal()
zone_ts <- ts(data$zone_total, frequency = 144)
fit_zone <- auto.arima(zone_ts, seasonal = TRUE)
summary(fit_zone)
## Series: zone_ts
## ARIMA(4,0,0)(0,1,0)[144]
##
## Coefficients:
## ar1 ar2 ar3 ar4
## 1.0733 -0.1280 0.0500 -0.0108
## s.e. 0.0044 0.0064 0.0064 0.0044
##
## sigma^2 = 349552: log likelihood = -407780.2
## AIC=815570.4 AICc=815570.4 BIC=815614.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2655549 590.3941 406.7471 -0.004305024 0.5934085 0.1754785
## ACF1
## Training set 4.183627e-05
p:4
d:0
q:0
forecast_values <- forecast(fit_zone, h = 77*2*5)
lasts <- tail(zone_ts, 77*2*10)
# Create a data frame for the historical data
history_df <- data.frame(Time = time(lasts), Value = as.numeric(lasts), Lo95 = NA, Hi95 = NA, Type = "Historical")
# Create a data frame for the forecasted data
forecast_df <- data.frame(Time = time(forecast_values$mean),
Value = as.numeric(forecast_values$mean),
Lo95 = as.numeric(forecast_values$lower[,2]),
Hi95 = as.numeric(forecast_values$upper[,2]),
Type = "Forecast")
# Ensure that 'Time' columns are of the same class
history_df$Time <- as.numeric(history_df$Time)
forecast_df$Time <- as.numeric(forecast_df$Time)
# Combine the historical and forecasted data into a single data frame
combined_df <- rbind(history_df, forecast_df)
# Create a ggplot of the historical and forecasted values
ggplot() +
geom_line(data = combined_df, aes(x = Time, y = Value, color = Type)) +
geom_line(data = forecast_df, aes(x = Time, y = Lo95), linetype = "dotted", color = "blue") +
geom_line(data = forecast_df, aes(x = Time, y = Hi95), linetype = "dotted", color = "blue") +
scale_color_manual(values = c("Historical" = "black", "Forecast" = "red")) +
labs(title = "ARIMA Forecast", x = "Time", y = "Value") +
theme_minimal()
Hasonló elemzés a timetk csomag használatával
data %>%
select(date_time,
month.lbl,
temperature,
day,
hour,
week,
zone_total) %>%
pivot_longer(-date_time:-week, names_to = "zone") %>%
group_by(zone) %>%
plot_time_series_regression(
.date_var = date_time,
value ~ month.lbl+ temperature + as.factor(day) + hour + week + lag(value) ,
.show_summary = TRUE,
.interactive = FALSE
) -> p
##
## Summary for Group: zone_total---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19133.7 -402.7 -13.3 343.7 10079.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.068e+03 3.808e+02 5.431 5.64e-08 ***
## month.lbl.L 2.653e+03 7.898e+02 3.359 0.000784 ***
## month.lbl.Q -1.288e+02 3.078e+01 -4.186 2.84e-05 ***
## month.lbl.C -7.666e+01 1.897e+01 -4.041 5.34e-05 ***
## month.lbl^4 1.364e+02 1.796e+01 7.599 3.04e-14 ***
## month.lbl^5 8.888e+01 1.592e+01 5.584 2.36e-08 ***
## month.lbl^6 -8.744e+01 1.580e+01 -5.534 3.14e-08 ***
## month.lbl^7 -8.554e+01 1.577e+01 -5.424 5.84e-08 ***
## month.lbl^8 1.659e+01 1.494e+01 1.110 0.266812
## month.lbl^9 8.548e+01 1.562e+01 5.472 4.46e-08 ***
## month.lbl^10 3.584e+01 1.499e+01 2.391 0.016823 *
## month.lbl^11 -1.851e+00 1.500e+01 -0.123 0.901787
## temperature 7.006e+00 1.798e+00 3.896 9.77e-05 ***
## as.factor(day)2 2.879e+01 3.337e+01 0.863 0.388284
## as.factor(day)3 4.713e+01 3.368e+01 1.399 0.161734
## as.factor(day)4 6.964e+01 3.392e+01 2.053 0.040057 *
## as.factor(day)5 7.311e+01 3.484e+01 2.098 0.035867 *
## as.factor(day)6 8.738e+01 3.527e+01 2.478 0.013231 *
## as.factor(day)7 9.583e+01 3.572e+01 2.683 0.007305 **
## as.factor(day)8 1.084e+02 3.673e+01 2.952 0.003163 **
## as.factor(day)9 1.011e+02 3.783e+01 2.672 0.007553 **
## as.factor(day)10 1.186e+02 3.910e+01 3.034 0.002414 **
## as.factor(day)11 1.248e+02 3.979e+01 3.136 0.001714 **
## as.factor(day)12 1.347e+02 4.200e+01 3.206 0.001346 **
## as.factor(day)13 1.548e+02 4.285e+01 3.612 0.000305 ***
## as.factor(day)14 1.428e+02 4.359e+01 3.276 0.001055 **
## as.factor(day)15 1.641e+02 4.528e+01 3.625 0.000289 ***
## as.factor(day)16 1.730e+02 4.705e+01 3.677 0.000236 ***
## as.factor(day)17 1.833e+02 4.889e+01 3.749 0.000178 ***
## as.factor(day)18 1.969e+02 4.984e+01 3.950 7.84e-05 ***
## as.factor(day)19 1.967e+02 5.269e+01 3.732 0.000190 ***
## as.factor(day)20 2.059e+02 5.369e+01 3.836 0.000125 ***
## as.factor(day)21 1.971e+02 5.464e+01 3.607 0.000309 ***
## as.factor(day)22 2.048e+02 5.663e+01 3.617 0.000298 ***
## as.factor(day)23 2.277e+02 5.874e+01 3.876 0.000106 ***
## as.factor(day)24 2.333e+02 6.084e+01 3.834 0.000126 ***
## as.factor(day)25 2.538e+02 6.197e+01 4.096 4.21e-05 ***
## as.factor(day)26 2.239e+02 6.508e+01 3.441 0.000580 ***
## as.factor(day)27 2.478e+02 6.622e+01 3.741 0.000183 ***
## as.factor(day)28 2.368e+02 6.734e+01 3.517 0.000437 ***
## as.factor(day)29 2.410e+02 6.981e+01 3.452 0.000556 ***
## as.factor(day)30 2.402e+02 7.215e+01 3.329 0.000872 ***
## as.factor(day)31 2.502e+02 7.752e+01 3.228 0.001247 **
## hour.L 1.041e+03 4.531e+01 22.982 < 2e-16 ***
## hour.Q -2.804e+03 2.510e+01 -111.749 < 2e-16 ***
## hour.C -1.001e+03 2.904e+01 -34.479 < 2e-16 ***
## hour^4 -1.213e+03 2.126e+01 -57.068 < 2e-16 ***
## hour^5 -1.204e+03 2.293e+01 -52.486 < 2e-16 ***
## hour^6 2.314e+02 2.411e+01 9.598 < 2e-16 ***
## hour^7 2.115e+03 2.094e+01 100.993 < 2e-16 ***
## hour^8 4.509e+02 2.271e+01 19.855 < 2e-16 ***
## hour^9 -8.617e+02 2.100e+01 -41.036 < 2e-16 ***
## hour^10 -3.220e+02 2.096e+01 -15.362 < 2e-16 ***
## hour^11 -4.009e+02 2.092e+01 -19.163 < 2e-16 ***
## hour^12 -2.424e+02 2.102e+01 -11.534 < 2e-16 ***
## hour^13 5.543e+02 2.095e+01 26.465 < 2e-16 ***
## hour^14 3.411e+02 2.093e+01 16.295 < 2e-16 ***
## hour^15 -5.771e+01 2.092e+01 -2.758 0.005816 **
## hour^16 -1.980e+01 2.092e+01 -0.947 0.343792
## hour^17 -8.355e+01 2.091e+01 -3.995 6.48e-05 ***
## hour^18 9.575e+01 2.091e+01 4.578 4.70e-06 ***
## hour^19 2.426e+02 2.092e+01 11.597 < 2e-16 ***
## hour^20 2.547e+02 2.091e+01 12.179 < 2e-16 ***
## hour^21 7.526e+01 2.091e+01 3.599 0.000320 ***
## hour^22 1.245e+02 2.091e+01 5.955 2.63e-09 ***
## hour^23 -4.622e+01 2.091e+01 -2.210 0.027092 *
## week -5.179e+01 1.520e+01 -3.408 0.000654 ***
## lag(value) 9.861e-01 6.957e-04 1417.396 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 977.3 on 52347 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9968, Adjusted R-squared: 0.9967
## F-statistic: 2.399e+05 on 67 and 52347 DF, p-value: < 2.2e-16
##
## ----
p
## Warning: Removed 1 row containing missing values (`geom_line()`).
nagyítsunk bele, hogy lássuk ahogyan a tényleges és a becsült értékek valójában eltérnek néhol, de “egész jól illeszkednek”
p +
scale_x_datetime(limits = c(as.POSIXct("2017-06-25 00:00:00"),
as.POSIXct("2017-06-30 00:00:00"))) +
labs(title = "Power vs Prediction, Zoomed In")
## Warning: Removed 103390 rows containing missing values (`geom_line()`).
data <- data %>%
mutate(
month.lbl = as.character(month.lbl),
day = as.factor(day),
hour = as.character(hour)
)
mod_base <- lm(
formula = zone_1 ~ month.lbl + temperature + day + hour + week + lag(zone_1),
data = data
)
performance::model_performance(mod_base)
Láthatjuk, hogy milyen jól viseledik ez az idősor. A továbbiakban érdemes lehetne megvizsgálni, hogy hogyan lehet az összefüggést feltárni a hőmérséklet és az áramfogyasztás között. A feltételezésem, hogy a magas hőmérsékletben sok áramot fogyasztanak a légkondícionálásra. Érdekes lehet továbbá kibővíteni az elemzést a személyek számával, hogyan ki hol tevékenykedik a nap során stb.